InΒ [2]:
import pandas as pd
from helper import get_my_dataset
import seaborn as sns
import matplotlib.pyplot as plt
InΒ [3]:
"""
Get the data_set filtered for my borough
"""
main_df = get_my_dataset()
InΒ [4]:
"""
Checking the rows and columns of the dataset
"""
main_df.shape
Out[4]:
(2070, 32)
InΒ [5]:
main_df.dtypes
Out[5]:
Accident_Index                                  object
Location_Easting_OSGR                            int64
Location_Northing_OSGR                           int64
Longitude                                      float64
Latitude                                       float64
Police_Force                                     int64
Accident_Severity                                int64
Number_of_Vehicles                               int64
Number_of_Casualties                             int64
Date                                            object
Day_of_Week                                      int64
Time                                            object
Local_Authority_District                         int64
Local_Authority_Highway                         object
1st_Road_Class                                   int64
1st_Road_Number                                  int64
Road_Type                                        int64
Speed_limit                                      int64
Junction_Detail                                  int64
Junction_Control                                 int64
2nd_Road_Class                                   int64
2nd_Road_Number                                  int64
Pedestrian_Crossing-Human_Control                int64
Pedestrian_Crossing-Physical_Facilities          int64
Light_Conditions                                 int64
Weather_Conditions                               int64
Road_Surface_Conditions                          int64
Special_Conditions_at_Site                       int64
Carriageway_Hazards                              int64
Urban_or_Rural_Area                              int64
Did_Police_Officer_Attend_Scene_of_Accident      int64
LSOA_of_Accident_Location                       object
dtype: object
InΒ [6]:
"""
Function to change data types according to manually updated data-types.csv.
"""
def process_dataframe(df, csv_path):
    # Read the CSV file
    instructions = pd.read_csv(csv_path)
    
    # Iterate over the instructions
    for index, row in instructions.iterrows():
        col_name = row.iloc[0]
        new_type = row.iloc[1]
        
        # Change the data type of the column
        if new_type == "numerical":
            df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
        elif new_type == "date":
            df[col_name] = pd.to_datetime(df[col_name], dayfirst=True, format='%d/%m/%Y', errors='coerce')
        elif new_type == "time":
            df[col_name] = pd.to_datetime(df[col_name], format='%H:%M').dt.time
        elif new_type == "categorical":
            df[col_name] = df[col_name].astype('str')
    
    return df

# Example usage
processed_df = process_dataframe(main_df, 'data_types.csv')
processed_df.dtypes
Out[6]:
Accident_Index                                         object
Location_Easting_OSGR                                   int64
Location_Northing_OSGR                                  int64
Longitude                                             float64
Latitude                                              float64
Police_Force                                           object
Accident_Severity                                      object
Number_of_Vehicles                                      int64
Number_of_Casualties                                    int64
Date                                           datetime64[ns]
Day_of_Week                                            object
Time                                                   object
Local_Authority_District                               object
Local_Authority_Highway                                object
1st_Road_Class                                         object
1st_Road_Number                                        object
Road_Type                                              object
Speed_limit                                            object
Junction_Detail                                        object
Junction_Control                                       object
2nd_Road_Class                                         object
2nd_Road_Number                                        object
Pedestrian_Crossing-Human_Control                      object
Pedestrian_Crossing-Physical_Facilities                object
Light_Conditions                                       object
Weather_Conditions                                     object
Road_Surface_Conditions                                object
Special_Conditions_at_Site                             object
Carriageway_Hazards                                    object
Urban_or_Rural_Area                                    object
Did_Police_Officer_Attend_Scene_of_Accident            object
LSOA_of_Accident_Location                              object
dtype: object
InΒ [7]:
""" 
Checking missing values
"""
# Define all conditions for missing or invalid values
conditions = [
    processed_df.isnull(),                # NaN or None
    processed_df.isna(),                  # Alias for isnull()
    (processed_df == 'Nan'),              # String 'Nan'
    (processed_df == 'nan'),              # String 'nan'
    (processed_df == pd.NA),              # Pandas NA
    (processed_df == '-1'),               # String '-1'
    (processed_df == ''),                 # Empty string
    (processed_df == 'NULL'),             # String 'NULL'
    (processed_df == None)                # Python None
]

# Combine all conditions using logical OR
combined_mask = conditions[0]
for condition in conditions[1:]:
    combined_mask |= condition

# Count missing values per column
missing_values = combined_mask.sum()

# Print the results
print(missing_values)
Accident_Index                                   0
Location_Easting_OSGR                            0
Location_Northing_OSGR                           0
Longitude                                        0
Latitude                                         0
Police_Force                                     0
Accident_Severity                                0
Number_of_Vehicles                               0
Number_of_Casualties                             0
Date                                             0
Day_of_Week                                      0
Time                                             0
Local_Authority_District                         0
Local_Authority_Highway                          0
1st_Road_Class                                   0
1st_Road_Number                                  0
Road_Type                                        0
Speed_limit                                      0
Junction_Detail                                  0
Junction_Control                               570
2nd_Road_Class                                 570
2nd_Road_Number                                  0
Pedestrian_Crossing-Human_Control                0
Pedestrian_Crossing-Physical_Facilities          0
Light_Conditions                                 0
Weather_Conditions                               0
Road_Surface_Conditions                          0
Special_Conditions_at_Site                       0
Carriageway_Hazards                              0
Urban_or_Rural_Area                              0
Did_Police_Officer_Attend_Scene_of_Accident      0
LSOA_of_Accident_Location                        4
dtype: int64
InΒ [8]:
# Create a heatmap to visualize the missing values
plt.figure(figsize=(12, 8))
sns.heatmap(combined_mask, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()
No description has been provided for this image
InΒ [9]:
#Outliers
print(processed_df.shape)
# Select numeric variables
my_data_numeric = processed_df.select_dtypes(include='number')
print('The dataset has', len(my_data_numeric.columns),'Numeric variables:')
print(my_data_numeric)

# Select categorical variables
my_data_categorical = processed_df.select_dtypes(include='object')
print('The dataset has', len(my_data_categorical.columns),'Categorical variables:')
print(my_data_categorical)
(2070, 32)
The dataset has 6 Numeric variables:
      Location_Easting_OSGR  Location_Northing_OSGR  Longitude   Latitude  \
0                    521760                  169840  -0.250492  51.414457   
1                    518140                  165370  -0.304009  51.375047   
2                    519130                  163980  -0.290258  51.362347   
3                    520840                  167190  -0.264617  51.390837   
4                    521280                  172110  -0.256614  51.434962   
...                     ...                     ...        ...        ...   
2065                 517330                  161850  -0.316802  51.343577   
2066                 517870                  169360  -0.306563  51.410965   
2067                 519370                  169000  -0.285126  51.407416   
2068                 519330                  169110  -0.285664  51.408413   
2069                 521830                  169870  -0.249475  51.414712   

      Number_of_Vehicles  Number_of_Casualties  
0                      1                     1  
1                      3                     1  
2                      2                     2  
3                      3                     1  
4                      2                     1  
...                  ...                   ...  
2065                   2                     1  
2066                   2                     1  
2067                   2                     1  
2068                   1                     1  
2069                   2                     1  

[2070 rows x 6 columns]
The dataset has 25 Categorical variables:
     Accident_Index Police_Force Accident_Severity Day_of_Week      Time  \
0     200501TE00016            1                 3           5  07:25:00   
1     200501TE00017            1                 3           3  08:35:00   
2     200501TE00026            1                 2           1  01:17:00   
3     200501TE00039            1                 3           1  18:00:00   
4     200501TE00060            1                 2           6  06:20:00   
...             ...          ...               ...         ...       ...   
2065  201001VK30404            1                 3           3  14:45:00   
2066  201001VK30408            1                 3           4  16:10:00   
2067  201001VK30420            1                 3           2  08:00:00   
2068  201001VK39057            1                 2           2  18:45:00   
2069  201001VW40202            1                 3           5  18:00:00   

     Local_Authority_District Local_Authority_Highway 1st_Road_Class  \
0                          23               E09000021              3   
1                          23               E09000021              3   
2                          23               E09000021              5   
3                          23               E09000021              3   
4                          23               E09000021              3   
...                       ...                     ...            ...   
2065                       23               E09000021              3   
2066                       23               E09000021              6   
2067                       23               E09000021              6   
2068                       23               E09000021              3   
2069                       23               E09000021              3   

     1st_Road_Number Road_Type  ... Pedestrian_Crossing-Human_Control  \
0                  3         6  ...                                 0   
1                  3         3  ...                                 0   
2                  0         3  ...                                 0   
3                  3         3  ...                                 0   
4                  3         3  ...                                 0   
...              ...       ...  ...                               ...   
2065             243         3  ...                                 0   
2066               0         6  ...                                 0   
2067               0         6  ...                                 0   
2068            2043         6  ...                                 0   
2069             238         6  ...                                 0   

     Pedestrian_Crossing-Physical_Facilities Light_Conditions  \
0                                          0                4   
1                                          0                1   
2                                          0                4   
3                                          0                4   
4                                          0                4   
...                                      ...              ...   
2065                                       0                1   
2066                                       5                4   
2067                                       0                1   
2068                                       4                1   
2069                                       0                1   

     Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site  \
0                     1                       2                          0   
1                     1                       2                          0   
2                     1                       1                          0   
3                     1                       1                          0   
4                     1                       1                          0   
...                 ...                     ...                        ...   
2065                  1                       1                          0   
2066                  1                       1                          0   
2067                  2                       2                          0   
2068                  1                       1                          0   
2069                  1                       1                          0   

     Carriageway_Hazards Urban_or_Rural_Area  \
0                      0                   1   
1                      0                   1   
2                      0                   1   
3                      0                   1   
4                      0                   1   
...                  ...                 ...   
2065                   0                   2   
2066                   0                   1   
2067                   0                   1   
2068                   6                   1   
2069                   0                   1   

     Did_Police_Officer_Attend_Scene_of_Accident LSOA_of_Accident_Location  
0                                              1                 E01002953  
1                                              1                 E01003002  
2                                              1                 E01002946  
3                                              1                 E01002981  
4                                              1                 E01002955  
...                                          ...                       ...  
2065                                           2                 E01002948  
2066                                           1                 E01002968  
2067                                           2                 E01002969  
2068                                           1                 E01002970  
2069                                           1                 E01002953  

[2070 rows x 25 columns]
InΒ [10]:
# Outliers - For numerics
fig = plt.figure(figsize= (15,30))

for i in range(len(my_data_numeric.columns)):
    column = my_data_numeric.columns[i]
    sub = fig.add_subplot(8,3,i+1)
    chart = sns.boxplot(data=my_data_numeric, y=column, color='g')
No description has been provided for this image
InΒ [11]:
# Use histogram for all the numeical variables
fig = plt.figure(figsize= (15,30))

for i in range(len(my_data_numeric.columns)):
    column = my_data_numeric.columns[i]
    sub = fig.add_subplot(8,3, i+1)
    chart = sns.histplot(data=my_data_numeric, x=column, bins=50, color='g', kde=False)
No description has been provided for this image
InΒ [12]:
# Outliers - for categorical values
# Number of categorical variables
num_categorical = len(my_data_categorical.columns)

# Create a figure with enough rows
fig = plt.figure(figsize=(15, num_categorical * 5))  # Adjust height dynamically

for i, column in enumerate(my_data_categorical.columns):
    sub = fig.add_subplot(num_categorical, 1, i+1)  # Use the total number of plots
    sns.countplot(data=processed_df, x=column, color='y', ax=sub)  # Use ax=sub for subplot integration
    sub.set_title(f"Count Plot for {column}", fontsize=12)
    plt.xticks(rotation=45)  # Rotate x-axis labels if necessary for clarity

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()
No description has been provided for this image
InΒ [13]:
#Tackle data quality issues
# Drop duplicates
processed_df = processed_df[processed_df.columns[~processed_df.columns.isin(['Location_Easting_OSGR', 'Location_Northing_OSGR'])]] # Drop two variables TOWN_No and CMEDV
processed_df
Out[13]:
Accident_Index Longitude Latitude Police_Force Accident_Severity Number_of_Vehicles Number_of_Casualties Date Day_of_Week Time ... Pedestrian_Crossing-Human_Control Pedestrian_Crossing-Physical_Facilities Light_Conditions Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site Carriageway_Hazards Urban_or_Rural_Area Did_Police_Officer_Attend_Scene_of_Accident LSOA_of_Accident_Location
0 200501TE00016 -0.250492 51.414457 1 3 1 1 2005-01-13 5 07:25:00 ... 0 0 4 1 2 0 0 1 1 E01002953
1 200501TE00017 -0.304009 51.375047 1 3 3 1 2005-01-18 3 08:35:00 ... 0 0 1 1 2 0 0 1 1 E01003002
2 200501TE00026 -0.290258 51.362347 1 2 2 2 2005-01-09 1 01:17:00 ... 0 0 4 1 1 0 0 1 1 E01002946
3 200501TE00039 -0.264617 51.390837 1 3 3 1 2005-01-23 1 18:00:00 ... 0 0 4 1 1 0 0 1 1 E01002981
4 200501TE00060 -0.256614 51.434962 1 2 2 1 2005-01-21 6 06:20:00 ... 0 0 4 1 1 0 0 1 1 E01002955
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2065 201001VK30404 -0.316802 51.343577 1 3 2 1 2010-12-14 3 14:45:00 ... 0 0 1 1 1 0 0 2 2 E01002948
2066 201001VK30408 -0.306563 51.410965 1 3 2 1 2010-12-22 4 16:10:00 ... 0 5 4 1 1 0 0 1 1 E01002968
2067 201001VK30420 -0.285126 51.407416 1 3 2 1 2010-11-08 2 08:00:00 ... 0 0 1 2 2 0 0 1 2 E01002969
2068 201001VK39057 -0.285664 51.408413 1 2 1 1 2010-05-17 2 18:45:00 ... 0 4 1 1 1 0 6 1 1 E01002970
2069 201001VW40202 -0.249475 51.414712 1 3 2 1 2010-06-17 5 18:00:00 ... 0 0 1 1 1 0 0 1 1 E01002953

2070 rows Γ— 30 columns

InΒ [14]:
my_data_no_missing = processed_df.dropna()
my_data_no_missing.shape
Out[14]:
(2070, 30)
InΒ [15]:
numerical_cols = ['Number_of_Casualties', 'Longitude', 'Latitude']

# Remove outliers using IQR
outliers_index = []
for col in numerical_cols:
    Q1 = my_data_no_missing[col].quantile(0.25)
    Q3 = my_data_no_missing[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers_index += my_data_no_missing.index[(my_data_no_missing[col] < lower_bound) | (my_data_no_missing[col] > upper_bound)].tolist()

# Drop rows with outliers
outliers_index = set(outliers_index)
my_data_cleaned = my_data_no_missing.drop(index=outliers_index)
my_data_cleaned.shape
Out[15]:
(1705, 30)
InΒ [16]:
#analysis without outlier removing
light_conditions_count = my_data_cleaned['Light_Conditions'].value_counts(normalize=True)
print("Proportion of Light Conditions:")
print(light_conditions_count)

# Example: Analyzing Weather Conditions
weather_conditions_count = my_data_cleaned['Weather_Conditions'].value_counts(normalize=True)
print("Proportion of Weather Conditions:")
print(weather_conditions_count)
Proportion of Light Conditions:
Light_Conditions
1    0.777713
4    0.221114
7    0.000587
6    0.000587
Name: proportion, dtype: float64
Proportion of Weather Conditions:
Weather_Conditions
1    0.889736
2    0.080938
8    0.015249
4    0.006452
5    0.002933
3    0.002346
7    0.001760
9    0.000587
Name: proportion, dtype: float64
InΒ [17]:
# Plot for Light_Conditions
plt.figure(figsize=(8, 5))
light_conditions_count.plot(kind='bar', color='skyblue', alpha=0.8)
plt.title('Distribution of Light Conditions')
plt.xlabel('Light Conditions')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Plot for Weather_Conditions
plt.figure(figsize=(8, 5))
weather_conditions_count.plot(kind='bar', color='orange', alpha=0.8)
plt.title('Distribution of Weather Conditions')
plt.xlabel('Weather Conditions')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
InΒ [18]:
# Filter rows for serious and fatal accidents
serious_accidents = my_data_cleaned[my_data_cleaned['Accident_Severity'].isin(['Serious', 'Fatal'])]

# Analyze Weather Conditions for serious accidents
serious_weather = serious_accidents['Weather_Conditions'].value_counts(normalize=True)
print("Weather Conditions for Serious/Fatal Accidents:")
print(serious_weather)

# Analyze Road Surface Conditions for serious accidents
serious_road_surface = serious_accidents['Road_Surface_Conditions'].value_counts(normalize=True)
print("Road Surface Conditions for Serious/Fatal Accidents:")
print(serious_road_surface)
Weather Conditions for Serious/Fatal Accidents:
Series([], Name: proportion, dtype: float64)
Road Surface Conditions for Serious/Fatal Accidents:
Series([], Name: proportion, dtype: float64)
InΒ [19]:
# Plot accident hotspots
plt.figure(figsize=(10, 6))
plt.scatter(my_data_cleaned['Longitude'], my_data_cleaned['Latitude'], alpha=0.5, s=10)
plt.title("Accident Hotspots")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()
No description has been provided for this image
InΒ [20]:
import folium

# Create a base map centered around the mean location
center_lat = my_data_cleaned['Latitude'].mean()
center_lon = my_data_cleaned['Longitude'].mean()
map_hotspots = folium.Map(location=[center_lat, center_lon], zoom_start=13)

# Add individual markers for each accident location
for _, row in my_data_cleaned.iterrows():
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=2,  # Radius of the marker
        color='blue',  # Border color of the circle
        fill=True,
        fill_color='blue',  # Fill color of the circle
        fill_opacity=0.6  # Opacity of the marker
    ).add_to(map_hotspots)

# Display the map
map_hotspots
Out[20]:
Make this Notebook Trusted to load map: File -> Trust Notebook
InΒ [21]:
import folium
from folium.plugins import MarkerCluster

# Create a base map centered around the mean latitude and longitude
center_lat = my_data_cleaned['Latitude'].mean()
center_lon = my_data_cleaned['Longitude'].mean()
map_hotspots = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Create a marker cluster to group nearby accidents
marker_cluster = MarkerCluster().add_to(map_hotspots)

# Add markers for each accident location
for _, row in my_data_cleaned.iterrows():
    folium.Marker(location=[row['Latitude'], row['Longitude']]).add_to(marker_cluster)

# Display the map
map_hotspots
Out[21]:
Make this Notebook Trusted to load map: File -> Trust Notebook
InΒ [22]:
print(my_data_cleaned.shape)
fig = plt.figure(figsize=(15, 30))

# Filter numeric columns
numeric_cols = my_data_cleaned.select_dtypes(include='number').columns

for i, column in enumerate(numeric_cols):
    sub = fig.add_subplot(8, 3, i + 1)
    chart = sns.histplot(data=my_data_cleaned, x=column, bins=50, color='g', kde=False)

plt.tight_layout()
plt.show()
(1705, 30)
No description has been provided for this image
InΒ [23]:
# Plot Boxplot after IQR applied
fig = plt.figure(figsize=(15, 30))

my_data = my_data_cleaned.select_dtypes(include='number')

for i in range(len(my_data.columns)):
    column = my_data.columns[i]
    sub = fig.add_subplot(8, 3, i + 1)
    chart = sns.boxplot(data=my_data, y=column, color='y')
No description has been provided for this image
InΒ [24]:
# List of categorical columns
categorical_cols = [
    'Police_Force', 'Accident_Severity', 'Day_of_Week', 'Local_Authority_District',
    'Local_Authority_Highway', '1st_Road_Class', 'Road_Type', 'Speed_limit',
    'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
    'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities',
    'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions',
    'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Urban_or_Rural_Area',
    'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location'
]

# Analyze and visualize distributions
for col in categorical_cols:
    print(f"Class distribution for {col}:")
    print(my_data_cleaned[col].value_counts(normalize=True) * 100)  # Print proportions
    print()

    # Plot class distribution
    plt.figure(figsize=(8, 4))
    my_data_cleaned[col].value_counts(normalize=True).plot(
        kind='bar', color='skyblue', alpha=0.8
    )
    plt.title(f"Class Distribution for {col}")
    plt.xlabel("Class")
    plt.ylabel("Proportion (%)")
    plt.xticks(rotation=45)
    plt.show()
Class distribution for Police_Force:
Police_Force
1    100.0
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Accident_Severity:
Accident_Severity
3    85.161290
2    14.310850
1     0.527859
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Day_of_Week:
Day_of_Week
6    16.304985
4    16.011730
2    15.835777
5    15.601173
3    15.307918
7    12.023460
1     8.914956
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Local_Authority_District:
Local_Authority_District
23    100.0
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Local_Authority_Highway:
Local_Authority_Highway
E09000021    100.0
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for 1st_Road_Class:
1st_Road_Class
3    65.337243
6    15.835777
5    11.495601
4     7.331378
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Road_Type:
Road_Type
6    72.316716
3    16.363636
1     6.334311
2     3.167155
7     1.759531
9     0.058651
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Speed_limit:
Speed_limit
30    93.782991
50     4.340176
40     1.231672
20     0.645161
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Junction_Detail:
Junction_Detail
3    44.574780
0    27.096774
6    10.146628
1     8.328446
8     4.164223
7     2.346041
5     1.759531
2     1.466276
9     0.117302
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Junction_Control:
Junction_Control
4     58.299120
-1    27.096774
2     14.545455
1      0.058651
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for 2nd_Road_Class:
2nd_Road_Class
6     45.102639
-1    27.096774
3     14.838710
5      7.683284
4      5.278592
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Pedestrian_Crossing-Human_Control:
Pedestrian_Crossing-Human_Control
0    99.882698
2     0.117302
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Pedestrian_Crossing-Physical_Facilities:
Pedestrian_Crossing-Physical_Facilities
0    77.184751
5     9.208211
1     7.448680
4     5.630499
8     0.410557
7     0.117302
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Light_Conditions:
Light_Conditions
1    77.771261
4    22.111437
7     0.058651
6     0.058651
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Weather_Conditions:
Weather_Conditions
1    88.973607
2     8.093842
8     1.524927
4     0.645161
5     0.293255
3     0.234604
7     0.175953
9     0.058651
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Road_Surface_Conditions:
Road_Surface_Conditions
1    83.284457
2    15.542522
4     0.938416
3     0.234604
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Special_Conditions_at_Site:
Special_Conditions_at_Site
0    98.592375
4     0.703812
6     0.469208
5     0.058651
7     0.058651
1     0.058651
3     0.058651
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Carriageway_Hazards:
Carriageway_Hazards
0    98.944282
7     0.410557
2     0.293255
1     0.175953
6     0.117302
3     0.058651
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Urban_or_Rural_Area:
Urban_or_Rural_Area
1    97.302053
2     2.697947
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for Did_Police_Officer_Attend_Scene_of_Accident:
Did_Police_Officer_Attend_Scene_of_Accident
1    81.935484
2    17.302053
3     0.762463
Name: proportion, dtype: float64

No description has been provided for this image
Class distribution for LSOA_of_Accident_Location:
LSOA_of_Accident_Location
E01002968    11.436950
E01002984     3.460411
E01002948     3.343109
E01003003     2.932551
E01003004     2.639296
               ...    
E01030388     0.058651
E01003445     0.058651
E01002937     0.058651
E01003817     0.058651
E01002923     0.058651
Name: proportion, Length: 100, dtype: float64

No description has been provided for this image
InΒ [25]:
#Explore basic statistics of each attribute
categorical_cols = [
    'Police_Force', 'Accident_Severity', 'Day_of_Week', 'Local_Authority_District',
    'Local_Authority_Highway', '1st_Road_Class', 'Road_Type', 'Speed_limit',
    'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
    'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities',
    'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions',
    'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Urban_or_Rural_Area',
    'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location'
]

for col in categorical_cols:
    print(f"Statistics for {col}:\n")
    
    # Frequency counts and proportions
    counts = my_data_cleaned[col].value_counts()
    proportions = my_data_cleaned[col].value_counts(normalize=True)
    
    # Mode (most frequent value)
    mode = my_data_cleaned[col].mode()[0]
    
    print(f"Counts:\n{counts}")
    print(f"Proportions (%):\n{proportions * 100}")
    print(f"Mode: {mode}\n")
    
    # Visualization: Bar plot for class distributions
    plt.figure(figsize=(8, 4))
    counts.plot(kind='bar', color='skyblue', alpha=0.8)
    plt.title(f"Class Distribution for {col}")
    plt.xlabel("Class")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45)
    plt.show()
Statistics for Police_Force:

Counts:
Police_Force
1    1705
Name: count, dtype: int64
Proportions (%):
Police_Force
1    100.0
Name: proportion, dtype: float64
Mode: 1

No description has been provided for this image
Statistics for Accident_Severity:

Counts:
Accident_Severity
3    1452
2     244
1       9
Name: count, dtype: int64
Proportions (%):
Accident_Severity
3    85.161290
2    14.310850
1     0.527859
Name: proportion, dtype: float64
Mode: 3

No description has been provided for this image
Statistics for Day_of_Week:

Counts:
Day_of_Week
6    278
4    273
2    270
5    266
3    261
7    205
1    152
Name: count, dtype: int64
Proportions (%):
Day_of_Week
6    16.304985
4    16.011730
2    15.835777
5    15.601173
3    15.307918
7    12.023460
1     8.914956
Name: proportion, dtype: float64
Mode: 6

No description has been provided for this image
Statistics for Local_Authority_District:

Counts:
Local_Authority_District
23    1705
Name: count, dtype: int64
Proportions (%):
Local_Authority_District
23    100.0
Name: proportion, dtype: float64
Mode: 23

No description has been provided for this image
Statistics for Local_Authority_Highway:

Counts:
Local_Authority_Highway
E09000021    1705
Name: count, dtype: int64
Proportions (%):
Local_Authority_Highway
E09000021    100.0
Name: proportion, dtype: float64
Mode: E09000021

No description has been provided for this image
Statistics for 1st_Road_Class:

Counts:
1st_Road_Class
3    1114
6     270
5     196
4     125
Name: count, dtype: int64
Proportions (%):
1st_Road_Class
3    65.337243
6    15.835777
5    11.495601
4     7.331378
Name: proportion, dtype: float64
Mode: 3

No description has been provided for this image
Statistics for Road_Type:

Counts:
Road_Type
6    1233
3     279
1     108
2      54
7      30
9       1
Name: count, dtype: int64
Proportions (%):
Road_Type
6    72.316716
3    16.363636
1     6.334311
2     3.167155
7     1.759531
9     0.058651
Name: proportion, dtype: float64
Mode: 6

No description has been provided for this image
Statistics for Speed_limit:

Counts:
Speed_limit
30    1599
50      74
40      21
20      11
Name: count, dtype: int64
Proportions (%):
Speed_limit
30    93.782991
50     4.340176
40     1.231672
20     0.645161
Name: proportion, dtype: float64
Mode: 30

No description has been provided for this image
Statistics for Junction_Detail:

Counts:
Junction_Detail
3    760
0    462
6    173
1    142
8     71
7     40
5     30
2     25
9      2
Name: count, dtype: int64
Proportions (%):
Junction_Detail
3    44.574780
0    27.096774
6    10.146628
1     8.328446
8     4.164223
7     2.346041
5     1.759531
2     1.466276
9     0.117302
Name: proportion, dtype: float64
Mode: 3

No description has been provided for this image
Statistics for Junction_Control:

Counts:
Junction_Control
4     994
-1    462
2     248
1       1
Name: count, dtype: int64
Proportions (%):
Junction_Control
4     58.299120
-1    27.096774
2     14.545455
1      0.058651
Name: proportion, dtype: float64
Mode: 4

No description has been provided for this image
Statistics for 2nd_Road_Class:

Counts:
2nd_Road_Class
6     769
-1    462
3     253
5     131
4      90
Name: count, dtype: int64
Proportions (%):
2nd_Road_Class
6     45.102639
-1    27.096774
3     14.838710
5      7.683284
4      5.278592
Name: proportion, dtype: float64
Mode: 6

No description has been provided for this image
Statistics for Pedestrian_Crossing-Human_Control:

Counts:
Pedestrian_Crossing-Human_Control
0    1703
2       2
Name: count, dtype: int64
Proportions (%):
Pedestrian_Crossing-Human_Control
0    99.882698
2     0.117302
Name: proportion, dtype: float64
Mode: 0

No description has been provided for this image
Statistics for Pedestrian_Crossing-Physical_Facilities:

Counts:
Pedestrian_Crossing-Physical_Facilities
0    1316
5     157
1     127
4      96
8       7
7       2
Name: count, dtype: int64
Proportions (%):
Pedestrian_Crossing-Physical_Facilities
0    77.184751
5     9.208211
1     7.448680
4     5.630499
8     0.410557
7     0.117302
Name: proportion, dtype: float64
Mode: 0

No description has been provided for this image
Statistics for Light_Conditions:

Counts:
Light_Conditions
1    1326
4     377
7       1
6       1
Name: count, dtype: int64
Proportions (%):
Light_Conditions
1    77.771261
4    22.111437
7     0.058651
6     0.058651
Name: proportion, dtype: float64
Mode: 1

No description has been provided for this image
Statistics for Weather_Conditions:

Counts:
Weather_Conditions
1    1517
2     138
8      26
4      11
5       5
3       4
7       3
9       1
Name: count, dtype: int64
Proportions (%):
Weather_Conditions
1    88.973607
2     8.093842
8     1.524927
4     0.645161
5     0.293255
3     0.234604
7     0.175953
9     0.058651
Name: proportion, dtype: float64
Mode: 1

No description has been provided for this image
Statistics for Road_Surface_Conditions:

Counts:
Road_Surface_Conditions
1    1420
2     265
4      16
3       4
Name: count, dtype: int64
Proportions (%):
Road_Surface_Conditions
1    83.284457
2    15.542522
4     0.938416
3     0.234604
Name: proportion, dtype: float64
Mode: 1

No description has been provided for this image
Statistics for Special_Conditions_at_Site:

Counts:
Special_Conditions_at_Site
0    1681
4      12
6       8
5       1
7       1
1       1
3       1
Name: count, dtype: int64
Proportions (%):
Special_Conditions_at_Site
0    98.592375
4     0.703812
6     0.469208
5     0.058651
7     0.058651
1     0.058651
3     0.058651
Name: proportion, dtype: float64
Mode: 0

No description has been provided for this image
Statistics for Carriageway_Hazards:

Counts:
Carriageway_Hazards
0    1687
7       7
2       5
1       3
6       2
3       1
Name: count, dtype: int64
Proportions (%):
Carriageway_Hazards
0    98.944282
7     0.410557
2     0.293255
1     0.175953
6     0.117302
3     0.058651
Name: proportion, dtype: float64
Mode: 0

No description has been provided for this image
Statistics for Urban_or_Rural_Area:

Counts:
Urban_or_Rural_Area
1    1659
2      46
Name: count, dtype: int64
Proportions (%):
Urban_or_Rural_Area
1    97.302053
2     2.697947
Name: proportion, dtype: float64
Mode: 1

No description has been provided for this image
Statistics for Did_Police_Officer_Attend_Scene_of_Accident:

Counts:
Did_Police_Officer_Attend_Scene_of_Accident
1    1397
2     295
3      13
Name: count, dtype: int64
Proportions (%):
Did_Police_Officer_Attend_Scene_of_Accident
1    81.935484
2    17.302053
3     0.762463
Name: proportion, dtype: float64
Mode: 1

No description has been provided for this image
Statistics for LSOA_of_Accident_Location:

Counts:
LSOA_of_Accident_Location
E01002968    195
E01002984     59
E01002948     57
E01003003     50
E01003004     45
            ... 
E01030388      1
E01003445      1
E01002937      1
E01003817      1
E01002923      1
Name: count, Length: 100, dtype: int64
Proportions (%):
LSOA_of_Accident_Location
E01002968    11.436950
E01002984     3.460411
E01002948     3.343109
E01003003     2.932551
E01003004     2.639296
               ...    
E01030388     0.058651
E01003445     0.058651
E01002937     0.058651
E01003817     0.058651
E01002923     0.058651
Name: proportion, Length: 100, dtype: float64
Mode: E01002968

No description has been provided for this image
InΒ [26]:
from scipy.stats import skew, kurtosis

# Select numerical columns from my_data_cleaned
numerical_cols = my_data_cleaned.select_dtypes(include='number').columns

for col in numerical_cols:
    print(f"Statistics for {col}:\n")
    
    # Calculate statistics
    value_range = (my_data_cleaned[col].min(), my_data_cleaned[col].max())
    avg = my_data_cleaned[col].mean()
    std_dev = my_data_cleaned[col].std()
    skewness = skew(my_data_cleaned[col])
    kurt = kurtosis(my_data_cleaned[col])
    mode = my_data_cleaned[col].mode()[0]
    
    print(f"Range: {value_range}")
    print(f"Mean: {avg}")
    print(f"Standard Deviation: {std_dev}")
    print(f"Skewness: {skewness}")
    print(f"Kurtosis: {kurt}")
    print(f"Mode: {mode}\n")
    
    # Visualization: Histogram
    plt.figure(figsize=(8, 4))
    sns.histplot(my_data_cleaned[col], bins=30, color='skyblue', kde=True)
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.show()
Statistics for Longitude:

Range: (np.float64(-0.322592), np.float64(-0.240029))
Mean: -0.28657742873900294
Standard Deviation: 0.019574295133454777
Skewness: 0.5346755208852578
Kurtosis: -0.9840962485637044
Mode: -0.318182

No description has been provided for this image
Statistics for Latitude:

Range: (np.float64(51.340366), np.float64(51.436819))
Mean: 51.396396457478005
Standard Deviation: 0.019169678043049053
Skewness: -0.5224307522997643
Kurtosis: 0.10574098794478148
Mode: 51.341169

No description has been provided for this image
Statistics for Number_of_Vehicles:

Range: (np.int64(1), np.int64(8))
Mean: 1.790615835777126
Standard Deviation: 0.6215278451588202
Skewness: 1.2766063409035726
Kurtosis: 9.268785771091464
Mode: 2

No description has been provided for this image
Statistics for Number_of_Casualties:

Range: (np.int64(1), np.int64(1))
Mean: 1.0
Standard Deviation: 0.0
Skewness: nan
Kurtosis: nan
Mode: 1

C:\Users\USER\AppData\Local\Temp\ipykernel_23516\1517211522.py:13: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.
  skewness = skew(my_data_cleaned[col])
C:\Users\USER\AppData\Local\Temp\ipykernel_23516\1517211522.py:14: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.
  kurt = kurtosis(my_data_cleaned[col])
No description has been provided for this image

Data PreparationΒΆ